library(tidyverse)
library(readxl)
library(ggforce)
library(concaveman)
library(knitr)
library(olsrr)
library(ranger)
library(Metrics)
library(mgcv)
library(caret)

set.seed(3630)
# Strike Zone GG Object
geom_zone <- function(top = 11/3, bottom = 3/2, linecolor = "black"){
  geom_rect(xmin = -.7083, xmax = .7083, ymin = bottom, ymax = top,
            alpha = 0, color = linecolor, linewidth = 0.75)
}

# c(0, 0, -.25, -.5, -.25))

# Home Plate GG Object
geom_plate <- function(pov = "pitcher"){
  df <- case_when(
    pov == "pitcher" ~ 
      data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, .25, .5, .25)),
    pov == "catcher" ~ 
      data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
  )
  
  g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "black", linewidth = 1.25)
  g
}

# Barrel Function
is.barrel <- function(LA, EV){
  upper <- 1.11*EV - 78.89
  lower <- -EV + 124
  outcome <- (LA >= lower) & (LA <= upper) & (EV >= 98) & (LA >= 8) & (LA <= 50)
  outcome <- replace_na(outcome, FALSE)
  outcome
}

# Normal Name Changer
swap_names <- function(name) {
  parts <- strsplit(name, ", ")[[1]]
  if (length(parts) == 2) {
    return(paste(rev(parts), collapse = " "))
  } else {
    return(name)
  }
}
seasonal <- read_csv("CSVs/season_stats.csv")

pitchers <- read_csv("CSVs/pitcher_comps.csv")

arsenal <- read_csv("CSVs/arsenal.csv")

empty <- read_csv("CSVs/bases_empty.csv")

whiff <- pitchers %>% 
  mutate(whiff = description == "swinging_strike",
         whiff = as.character(whiff)) %>% 
  filter(pitch_type != "NA",
         pitch_type != "PO")

LHP <- read_csv("CSVs/lhp_pitches.csv") %>% 
  select(-...1) %>% 
  filter(!is.na(pitch_type)) %>% 
  mutate(pitch_type = str_replace(pitch_type, "CS", "CU"),
         pitch_name = str_replace(pitch_name, "Slow Curve", "Curveball"),
         pitch_type = str_replace(pitch_type, "KC", "CU"),
         pitch_name = str_replace(pitch_name, "Knuckle Curve", "Curveball"))


whiff_l <- LHP %>% 
  mutate(whiff = description == "swinging_strike",
         whiff = as.character(whiff)) %>% 
  filter(pitch_type != "NA",
         pitch_type != "PO")



Linear Regression With/Without Interaction Terms


# Model Data (Pitch = Slider, Pitching Hand = Right)
model_data <- arsenal %>% 
  filter(pitch_type == "SL",
         pitch_hand == "R") %>% 
  mutate(ovr_break = sqrt(pitcher_break_x^2 + pitcher_break_z^2))

# Simple Linear Regression
lm_simple <- lm(xwOBA ~ 
                  pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z +
                  pitch_usage + ovr_break,
                data = model_data)


# Overview of all model combinations
model_all <- ols_step_all_possible(lm_simple)

# Backwards Elimination
lm_simple %>% ols_step_backward_p(penter = 0.2)
## 
## 
##                                Stepwise Summary                                
## -----------------------------------------------------------------------------
## Step    Variable        AIC         SBC         SBIC         R2       Adj. R2 
## -----------------------------------------------------------------------------
##  0      Full Model    -879.334    -848.821    -1829.725    0.05444    0.03714 
##  1      spin_rate     -881.055    -854.356    -1831.498    0.05365    0.03927 
## -----------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.232       RMSE                  0.064 
## R-Squared               0.054       MSE                   0.004 
## Adj. R-Squared          0.039       Coef. Var            23.574 
## Pred R-Squared          0.020       AIC                -881.055 
## MAE                     0.049       SBC                -854.356 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.077          5          0.015     3.73    0.0027 
## Residual        1.356        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                     Parameter Estimates                                      
## --------------------------------------------------------------------------------------------
##           model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------
##     (Intercept)     0.888         0.181                  4.899    0.000     0.531     1.244 
##     pitch_speed    -0.006         0.002       -0.259    -3.242    0.001    -0.009    -0.002 
## pitcher_break_x     0.007         0.004        0.399     1.828    0.068     0.000     0.014 
## pitcher_break_z     0.032         0.017        2.198     1.814    0.071    -0.003     0.066 
##     pitch_usage     0.000         0.000       -0.105    -1.922    0.055    -0.001     0.000 
##       ovr_break    -0.035         0.018       -2.597    -1.962    0.051    -0.070     0.000 
## --------------------------------------------------------------------------------------------
# Stepwise Selection
lm_simple %>% ols_step_both_p(prem = 0.15, pent = 0.15)
## 
## 
##                                   Stepwise Summary                                  
## ----------------------------------------------------------------------------------
## Step    Variable             AIC         SBC         SBIC         R2       Adj. R2 
## ----------------------------------------------------------------------------------
##  0      Base Model         -872.581    -864.953    -1823.341    0.00000    0.00000 
##  1      pitch_usage (+)    -875.048    -863.605    -1825.823    0.01324    0.01028 
##  2      pitch_speed (+)    -877.136    -861.879    -1827.881    0.02521    0.01934 
##  3      ovr_break (+)      -881.609    -862.538    -1832.217    0.04387    0.03520 
## ----------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.209       RMSE                  0.064 
## R-Squared               0.044       MSE                   0.004 
## Adj. R-Squared          0.035       Coef. Var            23.624 
## Pred R-Squared          0.021       AIC                -881.609 
## MAE                     0.050       SBC                -862.538 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.063          3          0.021    5.062    0.0019 
## Residual        1.370        331          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)     0.861         0.176                  4.887    0.000     0.514     1.207 
## pitch_usage    -0.001         0.000       -0.121    -2.249    0.025    -0.001     0.000 
## pitch_speed    -0.006         0.002       -0.246    -3.236    0.001    -0.009    -0.002 
##   ovr_break    -0.003         0.001       -0.193    -2.541    0.012    -0.005    -0.001 
## ----------------------------------------------------------------------------------------
# New Model
lm1 <- lm(xwOBA ~ 
            ovr_break + pitch_usage + pitch_speed,
          data = model_data)

# Interaction Linear Regression
lm_interact <- lm(xwOBA ~ 
                    pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z + pitch_usage + 
                    ovr_break + 
                    pitch_speed*spin_rate + pitch_speed*pitch_usage + pitch_speed*ovr_break +
                    pitch_speed*pitcher_break_x + pitch_speed*pitcher_break_z + 
                    spin_rate*ovr_break + spin_rate*pitcher_break_x + spin_rate*pitcher_break_z +
                    pitch_usage*ovr_break + pitch_usage*spin_rate,
                  data = model_data)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
lm_interact %>% ols_step_both_p(pent = 0.15, prem = 0.05)
## 
## 
##                                           Stepwise Summary                                           
## ---------------------------------------------------------------------------------------------------
## Step    Variable                             AIC         SBC         SBIC         R2       Adj. R2  
## ---------------------------------------------------------------------------------------------------
##  0      Base Model                         -872.581    -864.953    -1826.673    0.00000     0.00000 
##  1      pitch_speed:pitch_usage (+)        -875.879    -864.437    -1831.752    0.01569     0.01273 
##  2      pitcher_break_z (+)                -874.227    -858.970    -1831.890    0.01671     0.01079 
##  3      pitch_speed:pitch_usage (-)        -870.698    -859.256    -1826.584    0.00035    -0.00265 
##  4      pitch_speed (+)                    -877.840    -862.584    -1835.493    0.02726     0.02140 
##  5      pitch_usage (+)                    -881.134    -862.063    -1840.555    0.04251     0.03383 
##  6      pitch_usage:ovr_break (+)          -882.750    -859.865    -1843.940    0.05279     0.04131 
##  7      ovr_break (+)                      -882.634    -855.935    -1845.597    0.05810     0.04379 
##  8      pitcher_break_z (-)                -884.460    -861.575    -1845.642    0.05761     0.04619 
##  9      spin_rate (+)                      -882.608    -855.909    -1845.571    0.05803     0.04371 
##  10     ovr_break (-)                      -874.597    -851.713    -1835.823    0.02946     0.01769 
##  11     pitch_speed:ovr_break (+)          -882.165    -855.466    -1845.130    0.05678     0.04245 
##  12     pitch_usage:ovr_break (-)          -879.449    -856.564    -1840.653    0.04341     0.03181 
##  13     pitcher_break_x (+)                -877.639    -850.940    -1840.627    0.04395     0.02942 
##  14     spin_rate (-)                      -879.454    -856.569    -1840.658    0.04342     0.03183 
##  15     pitch_speed:pitcher_break_z (+)    -880.623    -853.924    -1843.596    0.05243     0.03803 
## ---------------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.229       RMSE                  0.064 
## R-Squared               0.052       MSE                   0.004 
## Adj. R-Squared          0.038       Coef. Var            23.589 
## Pred R-Squared          0.019       AIC                -880.623 
## MAE                     0.049       SBC                -853.924 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.075          5          0.015    3.641    0.0032 
## Residual        1.358        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                           Parameter Estimates                                            
## --------------------------------------------------------------------------------------------------------
##                       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------------------
##                 (Intercept)     0.729         0.155                  4.705    0.000     0.424     1.034 
##                 pitch_speed    -0.004         0.002       -0.179    -2.577    0.010    -0.007    -0.001 
##                 pitch_usage    -0.001         0.000       -0.106    -1.946    0.053    -0.001     0.000 
##             pitcher_break_x     0.007         0.004        0.404     1.778    0.076    -0.001     0.014 
##       pitch_speed:ovr_break     0.000         0.000       -2.250    -1.904    0.058    -0.001     0.000 
## pitch_speed:pitcher_break_z     0.000         0.000        1.935     1.768    0.078     0.000     0.001 
## --------------------------------------------------------------------------------------------------------
# Output removes ALL interactions for p < 0.05
# Keeps same as simple LM pitcher_break_z + pitch_speed + pitch_usage

lm_interact %>% ols_step_both_p(pent = 0.15, prem = 0.10)
## 
## 
##                                           Stepwise Summary                                           
## ---------------------------------------------------------------------------------------------------
## Step    Variable                             AIC         SBC         SBIC         R2       Adj. R2  
## ---------------------------------------------------------------------------------------------------
##  0      Base Model                         -872.581    -864.953    -1826.673    0.00000     0.00000 
##  1      pitch_speed:pitch_usage (+)        -875.879    -864.437    -1831.752    0.01569     0.01273 
##  2      pitcher_break_z (+)                -874.227    -858.970    -1831.890    0.01671     0.01079 
##  3      pitch_speed:pitch_usage (-)        -870.698    -859.256    -1826.584    0.00035    -0.00265 
##  4      pitch_speed (+)                    -877.840    -862.584    -1835.493    0.02726     0.02140 
##  5      pitch_usage (+)                    -881.134    -862.063    -1840.555    0.04251     0.03383 
##  6      pitch_usage:ovr_break (+)          -882.750    -859.865    -1843.940    0.05279     0.04131 
##  7      ovr_break (+)                      -882.634    -855.935    -1845.597    0.05810     0.04379 
##  8      pitcher_break_z (-)                -884.460    -861.575    -1845.642    0.05761     0.04619 
##  9      spin_rate (+)                      -882.608    -855.909    -1845.571    0.05803     0.04371 
##  10     ovr_break (-)                      -874.597    -851.713    -1835.823    0.02946     0.01769 
##  11     pitch_speed:ovr_break (+)          -882.165    -855.466    -1845.130    0.05678     0.04245 
##  12     pitch_usage:ovr_break (-)          -879.449    -856.564    -1840.653    0.04341     0.03181 
##  13     pitcher_break_x (+)                -877.639    -850.940    -1840.627    0.04395     0.02942 
##  14     spin_rate (-)                      -879.454    -856.569    -1840.658    0.04342     0.03183 
##  15     pitch_speed:pitcher_break_z (+)    -880.623    -853.924    -1843.596    0.05243     0.03803 
## ---------------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                           
## ----------------------------------------------------------------
## R                       0.229       RMSE                  0.064 
## R-Squared               0.052       MSE                   0.004 
## Adj. R-Squared          0.038       Coef. Var            23.589 
## Pred R-Squared          0.019       AIC                -880.623 
## MAE                     0.049       SBC                -853.924 
## ----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.075          5          0.015    3.641    0.0032 
## Residual        1.358        329          0.004                    
## Total           1.433        334                                   
## -------------------------------------------------------------------
## 
##                                           Parameter Estimates                                            
## --------------------------------------------------------------------------------------------------------
##                       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## --------------------------------------------------------------------------------------------------------
##                 (Intercept)     0.729         0.155                  4.705    0.000     0.424     1.034 
##                 pitch_speed    -0.004         0.002       -0.179    -2.577    0.010    -0.007    -0.001 
##                 pitch_usage    -0.001         0.000       -0.106    -1.946    0.053    -0.001     0.000 
##             pitcher_break_x     0.007         0.004        0.404     1.778    0.076    -0.001     0.014 
##       pitch_speed:ovr_break     0.000         0.000       -2.250    -1.904    0.058    -0.001     0.000 
## pitch_speed:pitcher_break_z     0.000         0.000        1.935     1.768    0.078     0.000     0.001 
## --------------------------------------------------------------------------------------------------------
# New Model with Interactions (p -value < 0.10 threshhold)
lm2 <- lm(xwOBA ~ 
            pitch_speed + pitch_usage + 
            pitch_speed*ovr_break + pitch_speed*pitcher_break_z,
          data = model_data)


# Trimmed Data
model_results <- model_data %>% 
  select(first_name, last_name, 
         pitch_speed, pitch_usage, pitcher_break_z, ovr_break, xwOBA)

# Comparing Model Predictions
# lm1 = simple
# lm2 = interactions

model_results <- model_results %>% 
  mutate(lm1 = predict(lm1, model_results)) %>% 
  mutate(lm2 = predict(lm2, model_results))

# R and RMSE of Simple Linear Model
with(model_results, cor(xwOBA, lm1))
## [1] 0.2094431
with(model_results, rmse(xwOBA, lm1))
## [1] 0.06394718
# R and RMSE of Interactions Linear Model
with(model_results, cor(xwOBA, lm2))
## [1] 0.2151546
with(model_results, rmse(xwOBA, lm2))
## [1] 0.06386603
model_results %>% 
  select(xwOBA, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = xwOBA, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for RHP",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Model Data (Pitch = Slider, Pitching Hand = Left)
model_data_l <- arsenal %>% 
  filter(pitch_type == "SL",
         pitch_hand == "L") %>% 
  mutate(ovr_break = sqrt(pitcher_break_x^2 + pitcher_break_z^2))

# Simple Linear Regression
lm_simple_l <- lm(xwOBA ~ 
                    pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z +
                    pitch_usage + ovr_break,
                  data = model_data_l)



# Overview of all model combinations
model_all_l <- ols_step_all_possible(lm_simple_l)

# Backwards Elimination
lm_simple_l %>% ols_step_backward_p(penter = 0.15)
## 
## 
##                                 Stepwise Summary                                
## ------------------------------------------------------------------------------
## Step    Variable         AIC         SBC         SBIC        R2       Adj. R2  
## ------------------------------------------------------------------------------
##  0      Full Model     -205.439    -184.518    -491.033    0.04617    -0.01471 
##  1      pitch_speed    -207.131    -188.825    -492.913    0.04325    -0.00710 
##  2      spin_rate      -208.977    -193.287    -494.917    0.04180     0.00187 
##  3      pitch_usage    -209.989    -196.914    -496.149    0.03238     0.00245 
## ------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                           Model Summary                           
## -----------------------------------------------------------------
## R                        0.180       RMSE                  0.081 
## R-Squared                0.032       MSE                   0.007 
## Adj. R-Squared           0.002       Coef. Var            29.106 
## Pred R-Squared          -0.036       AIC                -209.989 
## MAE                      0.057       SBC                -196.914 
## -----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression      0.022          3          0.007    1.082    0.3605 
## Residual        0.670         97          0.007                    
## Total           0.692        100                                   
## -------------------------------------------------------------------
## 
##                                     Parameter Estimates                                     
## -------------------------------------------------------------------------------------------
##           model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------
##     (Intercept)     0.336         0.068                  4.936    0.000     0.201    0.471 
## pitcher_break_x    -0.013         0.009       -0.653    -1.425    0.157    -0.031    0.005 
## pitcher_break_z    -0.073         0.046       -4.509    -1.604    0.112    -0.164    0.017 
##       ovr_break     0.073         0.046        4.835     1.568    0.120    -0.019    0.165 
## -------------------------------------------------------------------------------------------
# Stepwise Selection
# lm_simple_l %>% ols_step_both_p(prem = 0.15, pent = 0.15)

# New Model
lm1_l <- lm(xwOBA ~ 
              pitcher_break_x + pitcher_break_z +
              ovr_break,
            data = model_data_l)


# Model Data (Pitch = Slider, Pitching Hand = Left)

# Interaction Linear Regression
lm_interact_l <- lm(xwOBA ~ 
                      pitch_speed + spin_rate + pitcher_break_x + pitcher_break_z + pitch_usage + 
                      ovr_break + 
                      pitch_speed*spin_rate + pitch_speed*pitch_usage + pitch_speed*ovr_break +
                      pitch_speed*pitcher_break_x + pitch_speed*pitcher_break_z + 
                      spin_rate*ovr_break + spin_rate*pitcher_break_x + spin_rate*pitcher_break_z +
                      pitch_usage*ovr_break + pitch_usage*spin_rate,
                    data = model_data_l)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
# lm_interact_l %>% ols_step_both_p(pent = 0.15, prem = 0.05)

# Output removes ALL interactions for p < 0.05
# Keeps same as simple LM pitcher_break_z + pitch_speed + pitch_usage

# lm_interact_l %>% ols_step_both_p(pent = 0.20, prem = 0.05)

# New Model with Interactions (p -value < 0.10 threshhold)
lm2_l <- lm(xwOBA ~ 
              spin_rate*pitcher_break_z,
            data = model_data_l)


# Trimmed Data
model_results_l <- model_data_l %>% 
  select(first_name, last_name, 
         pitch_speed, pitch_usage, pitcher_break_z, ovr_break, xwOBA)

# Comparing Model Predictions
# lm1 = simple
# lm2 = interactions

model_results_l <- model_results_l %>% 
  mutate(lm1 = predict(lm1, model_results_l)) %>% 
  mutate(lm2 = predict(lm2, model_results_l))

# R and RMSE of Simple Linear Model
with(model_results_l, cor(xwOBA, lm1))
## [1] 0.04569634
with(model_results_l, rmse(xwOBA, lm1))
## [1] 0.08383695
# R and RMSE of Interactions Linear Model
with(model_results_l, cor(xwOBA, lm2))
## [1] 0.02805347
with(model_results_l, rmse(xwOBA, lm2))
## [1] 0.08418886
model_results_l %>% 
  select(xwOBA, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = xwOBA, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for LHP",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Pitch by Pitch Data (Sliders)
pitches <- pitchers %>% 
  filter(pitch_type == "SL") %>% 
  mutate(pfx_x = pfx_x*12,
         pfx_z = pfx_z*12,
         ovr_break = round(sqrt(pfx_x^2 + pfx_z^2), 3))

# Simple Linear Regression
lm_pitches <- lm(delta_run_exp ~ 
                   release_speed + release_spin_rate + pfx_x + pfx_z +
                   ovr_break + release_extension,
                 data = pitches)



# Stepwise Selection
# lm_pitches %>% ols_step_both_p(prem = 0.25, pent = 0.15)

# New Model
lm1_pitches <- lm(delta_run_exp ~ 
                    release_spin_rate + pfx_x + release_extension,
                  data = pitches)


# Interaction Linear Regression
lm_interact_pitches <- lm(delta_run_exp ~ 
                            release_speed + release_spin_rate + pfx_x + pfx_z +
                            ovr_break + release_extension +
                            release_speed*release_spin_rate + release_speed*pfx_x +
                            release_speed*pfx_z + release_speed*ovr_break +
                            release_speed*release_extension + 
                            release_spin_rate*pfx_x + release_spin_rate*pfx_z + 
                            release_spin_rate*ovr_break + release_spin_rate*release_extension +
                            release_extension*pfx_x + release_extension*pfx_z + release_extension*ovr_break,
                          data = pitches)


# model_interact_all <- ols_step_all_possible(lm_interact)

# Stepwise Selection
lm_interact_pitches %>% ols_step_both_p(pent = 0.15, prem = 0.15)
## 
## 
##                                        Stepwise Summary                                        
## ---------------------------------------------------------------------------------------------
## Step    Variable                         AIC        SBC         SBIC         R2       Adj. R2 
## ---------------------------------------------------------------------------------------------
##  0      Base Model                     -72.271    -59.891    -10299.956    0.00000    0.00000 
##  1      release_spin_rate:pfx_x (+)    -73.773    -55.203    -10301.429    0.00097    0.00069 
##  2      release_extension (+)          -73.562    -48.803    -10301.189    0.00147    0.00091 
##  3      ovr_break (+)                  -73.409    -42.460    -10301.005    0.00198    0.00115 
##  4      release_speed (+)              -71.418    -34.280    -10298.986    0.00198    0.00087 
##  5      ovr_break (-)                  -71.620    -40.671    -10299.220    0.00148    0.00065 
##  6      pfx_x (+)                      -71.805    -34.667    -10299.372    0.00209    0.00098 
##  7      release_extension (-)          -72.690    -41.741    -10300.287    0.00178    0.00095 
##  8      pfx_z (+)                      -70.939    -33.800    -10298.508    0.00185    0.00074 
##  9      pfx_x (-)                      -70.480    -39.531    -10298.082    0.00117    0.00033 
##  10     release_speed:pfx_z (+)        -74.421    -37.282    -10301.980    0.00281    0.00170 
##  11     release_spin_rate:pfx_x (-)    -75.339    -44.391    -10302.931    0.00251    0.00168 
##  12     release_spin_rate (+)          -75.115    -37.976    -10302.672    0.00300    0.00190 
##  13     pfx_z (-)                      -71.481    -40.532    -10299.081    0.00144    0.00061 
## ---------------------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                           Model Summary                            
## ------------------------------------------------------------------
## R                        0.038       RMSE                   0.239 
## R-Squared                0.001       MSE                    0.057 
## Adj. R-Squared           0.001       Coef. Var          -3503.793 
## Pred R-Squared          -0.001       AIC                  -71.481 
## MAE                      0.118       SBC                  -40.532 
## ------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares          DF    Mean Square      F        Sig. 
## --------------------------------------------------------------------
## Regression      0.298           3          0.099    1.736    0.1574 
## Residual      206.296        3600          0.057                    
## Total         206.595        3603                                   
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##               model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -----------------------------------------------------------------------------------------------
##         (Intercept)     0.095         0.129                  0.738    0.461    -0.158    0.348 
##       release_speed    -0.002         0.002       -0.022    -1.209    0.227    -0.005    0.001 
##   release_spin_rate     0.000         0.000        0.026     1.325    0.185     0.000    0.000 
## release_speed:pfx_z     0.000         0.000       -0.012    -0.583    0.560     0.000    0.000 
## -----------------------------------------------------------------------------------------------
lm2_pitches <- lm(delta_run_exp ~ 
                    release_spin_rate*pfx_x + release_extension,
                  data = pitches)


model_results_pitches <- pitches %>% 
  select(delta_run_exp, release_speed, release_spin_rate, pfx_x, pfx_z,
         release_extension) %>% 
  mutate(lm1 = predict(lm1_pitches, pitches),
         lm2 = predict(lm2_pitches, pitches))

# R and RMSE of Simple Linear Model

model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.04470764
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2391844
# R and RMSE of Interaction Linear Model
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm2))
## [1] 0.04598308
model_results_pitches %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm2))
## [1] 0.2391706
# Graph
model_results_pitches %>% 
  select(delta_run_exp, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = delta_run_exp, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  theme_classic() +
  labs(title = "Linear Models for Pitch-by-Pitch Data",
       subtitle = "Predicting Run Expectancy Added",
       caption = "Pitchers: Scherzer, Taillon, Keller, Manoah, Gallen, Garcia, Gray",
       x = "Observed",
       y = "Predicted",
       color = "Model")

model_results_pitches_2 <- pitches %>% 
  select(delta_run_exp, release_speed, release_spin_rate, pfx_x, pfx_z,
         release_extension, ID) %>% 
  mutate(lm1 = predict(lm1_pitches, pitches),
         lm2 = predict(lm2_pitches, pitches))

# Graph
model_results_pitches_2 %>% 
  select(ID,delta_run_exp, lm1, lm2) %>%
  pivot_longer(cols = lm1:lm2, 
               names_to = "model",
               values_to = "pred") %>% 
  mutate(model = str_replace(model, "lm1", "Simple LM"),
         model = str_replace(model, "lm2", "Interaction LM")) %>%
  ggplot(aes(x = delta_run_exp, y = pred, color = model)) +
  geom_point(shape = 18, size = 1.5, alpha = 0.75) + 
  geom_smooth(se = FALSE) +
  scale_color_manual(values = c("navyblue", "skyblue")) +
  facet_wrap(~ ID, ncol = 1) +
  theme_classic() +
  labs(title = "Linear Models for Pitch-by-Pitch Data",
       subtitle = "Predicting Run Expectancy Added",
       caption = "Pitchers: Scherzer, Taillon, Keller, Manoah, Gallen, Garcia, Gray",
       x = "Observed",
       y = "Predicted",
       color = "Model")

# Correlations

model_results_pitches_2 %>% 
  filter(ID == "Great") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.06192874
model_results_pitches_2 %>% 
  filter(ID == "Decent") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.01215481
model_results_pitches_2 %>% 
  filter(ID == "Bad") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(cor(delta_run_exp, lm1))
## [1] 0.03652784
# RMSE

model_results_pitches_2 %>% 
  filter(ID == "Great") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.1927567
model_results_pitches_2 %>% 
  filter(ID == "Decent") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2850569
model_results_pitches_2 %>% 
  filter(ID == "Bad") %>% 
  filter(!is.na(release_spin_rate),
         !is.na(pfx_x),
         !is.na(release_extension),
         !is.na(delta_run_exp)) %>% 
  with(rmse(delta_run_exp, lm1))
## [1] 0.2510364
pitchers %>% 
  # filter(pitch_type == "SL") %>%
  ggplot(aes(x = plate_z)) +
  geom_histogram(binwidth = 0.15, color = "white")

pitchers %>% 
  ggplot(aes(x = delta_run_exp)) +
  geom_histogram(binwidth = 0.15, color = "white")

pitchers %>% 
  filter(pitch_type %in% c("FF", "SL", "CH")) %>%
  mutate(pitch_dist = sqrt(plate_x^2 + (2.5 - plate_z)^2)) %>% 
  ggplot(aes(x = pitch_dist, color = ID)) +
  geom_density() +
  facet_wrap(~ pitch_type,
             ncol = 1)

pitchers %>% 
  filter(pitch_type %in% c("FF", "SL", "CH")) %>%
  mutate(pitch_dist = sqrt(plate_x^2 + (2.5 - plate_z)^2)) %>%
  ggplot(aes(x = pitch_dist, y = delta_run_exp)) +
  geom_point(alpha = 0.15)



Modeling with Difference Variables


# Model?
model <- lm(delta_run_exp ~ dist + speed_change + break_change, 
            data = pitchers)

preds <- pitchers %>% 
  mutate(predicted = predict(model, pitchers)) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

acc <- preds %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_smooth()

acc

acc +
  facet_wrap(~ pitch_type)

acc +
  facet_wrap(~ ID)

data_ff <- pitchers %>% 
  filter(pitch_type == "FF",
         !is.na(break_change)) %>%  
  filter(pitch_type != "PO")

model_ff <- train(
  delta_run_exp ~ dist + speed_change + break_change + release_speed + pfx_x + pfx_z,
  data = data_ff,
  method = "ranger",
  trControl = trainControl(method = "cv", number = 5))

preds_ff <- cbind(data_ff, predict(model_ff)) %>% 
  as.data.frame() %>% 
  rename(observed = delta_run_exp,
         predicted = "predict(model_ff)") %>% 
  select(ID, zone, pitch_type, observed, predicted)

preds_ff %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  geom_abline(slope = 1, intercept = 0) +
  coord_fixed() +
  labs(title = "Fastball RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ff$observed, preds_ff$predicted), 4)))

preds_ff %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9625188
data_ff <- pitchers %>% 
  filter(pitch_type == "FF",
         !is.na(break_change))

model_ff <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_ff, mtry = 2)

preds_ff <- data_ff %>% 
  mutate(predicted = predict(model_ff, data_ff)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_ff %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Fastball RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ff$observed, preds_ff$predicted), 4)))

preds_ff %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9710876
data_si <- pitchers %>% 
  filter(pitch_type == "SI",
         !is.na(break_change))

model_si <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_si, mtry = 2)

preds_si <- data_si %>% 
  mutate(predicted = predict(model_si, data_si)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_si %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Sinker RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_si$observed, preds_si$predicted), 4)))

preds_si %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9591401
data_ch <- pitchers %>% 
  filter(pitch_type == "CH",
         !is.na(break_change))

model_ch <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_ch, mtry = 2)

preds_ch <- data_ch %>% 
  mutate(predicted = predict(model_ch, data_ch)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_ch %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Change-Up RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_ch$observed, preds_ch$predicted), 4)))

preds_ch %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9661838
data_sl <- pitchers %>% 
  filter(pitch_type == "SL",
         !is.na(break_change))

model_sl <- ranger(delta_run_exp ~ dist + speed_change + break_change, 
            data = data_sl, mtry = 2)

preds_sl <- data_sl %>% 
  mutate(predicted = predict(model_sl, data_sl)$predictions) %>% 
  rename(observed = delta_run_exp) %>% 
  select(ID, zone, pitch_type, observed, predicted) %>% 
  filter(pitch_type != "PO")

preds_sl %>% 
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Slider RF Model",
       caption = paste0("RMSE: ", round(rmse(preds_sl$observed, preds_sl$predicted), 4)))

preds_sl %>% 
  filter(!is.na(observed),
         !is.na(predicted)) %>% 
  with(cor(observed, predicted))
## [1] 0.9677164
# 
# Actmodel <- train(delta_run_exp ~ dist + speed_change + break_change,
#                   data = data_sl, method = "ranger", 
#                   trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE), preProcess = c("knnImpute"))
# plot(Actmodel$finalModel$forest)


Logistic Regression Whiff RHP

# Slider Logistic Model
whiff_sl <- whiff %>% 
  filter(pitch_type == "SL") %>% 
  mutate(whiff = str_replace(whiff, "TRUE", "1"),
         whiff = str_replace(whiff, "FALSE", "0"),
         whiff = as.numeric(whiff))

# Original Model
model1 <- glm(whiff ~ release_speed + spin_axis + pfx_x + pfx_z + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + dist,
              data = whiff_sl, family = binomial)

# Reduced Model
model1 <- glm(whiff ~ release_speed + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + dist,
              data = whiff_sl, family = binomial)

summary(model1)
## 
## Call:
## glm(formula = whiff ~ release_speed + plate_x + plate_z + release_spin_rate + 
##     speed_change + break_change + pfx_total + dist, family = binomial, 
##     data = whiff_sl)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        9.0550541  3.0643314   2.955  0.00313 ** 
## release_speed     -0.0924999  0.0353807  -2.614  0.00894 ** 
## plate_x            0.5454560  0.0870095   6.269 3.64e-10 ***
## plate_z           -0.2438012  0.0988763  -2.466  0.01367 *  
## release_spin_rate -0.0006138  0.0002186  -2.808  0.00498 ** 
## speed_change       0.1267177  0.0611358   2.073  0.03820 *  
## break_change       1.0446186  0.3722565   2.806  0.00501 ** 
## pfx_total         -0.9888782  0.2453062  -4.031 5.55e-05 ***
## dist              -0.5004963  0.1216086  -4.116 3.86e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3093.3  on 3604  degrees of freedom
## Residual deviance: 3003.3  on 3596  degrees of freedom
##   (6 observations deleted due to missingness)
## AIC: 3021.3
## 
## Number of Fisher Scoring iterations: 5
preds <- whiff_sl %>% 
  mutate(prediction_log = predict(model1, whiff_sl),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

preds %>% 
  ggplot(aes(x = as.character(whiff), y = prediction)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1, width = 0.1, height = 0)

preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Whiff proportion by predicted whiff value",
       subtitle = "Whiff predictions have a 1% bin width")

preds %>% 
  arrange(desc(prediction)) %>% 
  head(10)
## # A tibble: 10 × 85
##    pitch_type game_date  release_speed release_pos_x release_pos_z player_name 
##    <chr>      <date>             <dbl>         <dbl>         <dbl> <chr>       
##  1 SL         2022-06-03          85.5         -2.02          5.32 Gray, Josiah
##  2 SL         2022-04-13          83.4         -1.79          5.18 Gray, Josiah
##  3 SL         2022-04-13          84.7         -1.75          5.25 Gray, Josiah
##  4 SL         2022-04-13          83.3         -1.71          5.09 Gray, Josiah
##  5 SL         2022-04-08          84.5         -1.54          5.17 Gray, Josiah
##  6 SL         2022-04-13          84.1         -1.74          5.19 Gray, Josiah
##  7 SL         2022-04-08          83.7         -1.61          5.2  Gray, Josiah
##  8 SL         2022-04-08          86.2         -1.84          5.4  Gray, Josiah
##  9 SL         2022-04-26          85.7         -1.79          5.28 Gray, Josiah
## 10 SL         2022-04-08          85.1         -1.69          5.19 Gray, Josiah
## # ℹ 79 more variables: batter <dbl>, pitcher...8 <dbl>, events <chr>,
## #   description <chr>, zone <dbl>, des <chr>, game_type <chr>, stand <chr>,
## #   p_throws <chr>, home_team <chr>, away_team <chr>, type <chr>,
## #   hit_location <dbl>, bb_type <chr>, balls <dbl>, strikes <dbl>,
## #   game_year <dbl>, pfx_x <dbl>, pfx_z <dbl>, plate_x <dbl>, plate_z <dbl>,
## #   on_3b <dbl>, on_2b <dbl>, on_1b <dbl>, outs_when_up <dbl>, inning <dbl>,
## #   inning_topbot <chr>, hc_x <dbl>, hc_y <dbl>, tfs_deprecated <lgl>, …
whiff %>% 
  mutate(count = paste0(balls, "-", strikes)) %>% 
  filter(pitch_type == "SL") %>% 
  ggplot(aes(y = whiff, x = pfx_z*12)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~count) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome") +
  NULL

# Sliders
whiff %>% 
  filter(pitch_type =="SL") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

# Fastballs
whiff %>% 
  filter(pitch_type =="FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

# Change-Ups
whiff %>% 
  filter(pitch_type =="CH") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(stand), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(stand == "R",
         pitch_type == "SL",
         prev_pitch  %in% c("FF", "CH", "SL", "CU"),
         player_name == "Scherzer, Max") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(prev_pitch), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(stand == "R",
         pitch_type == "FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(player_name), rows = vars(whiff)) +
  theme_bw()

zoned <- whiff %>% 
  mutate(loc_x = round(plate_x*3, 0),
         loc_y = round(plate_z*3, 0))

zoned %>% 
  filter(pitch_type == "FF",
         plate_z > 0 & plate_z < 6,
         plate_x > -1.5 & plate_x < 1.5) %>% 
  summarize(whiff_perc = mean(whiff == "TRUE"),
            pitches = n(),
            .by = c(loc_x, loc_y, player_name)) %>% 
  filter(pitches >= 10) %>% 
  ggplot(aes(x = -loc_x, y = loc_y, fill = whiff_perc)) + 
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "red") +
  facet_wrap(~ player_name) +
  coord_fixed() +
  theme_bw()


Logistic Regression Whiff LHP

Logistic Regression

# Slider Logistic Model
whiff_sl2 <- whiff_l %>% 
  filter(pitch_type == "SL") %>% 
  mutate(whiff = str_replace(whiff, "TRUE", "1"),
         whiff = str_replace(whiff, "FALSE", "0"),
         whiff = as.numeric(whiff))

# Original Model
model2 <- glm(whiff ~ pitch_speed + spin_axis + pfx_x + pfx_z + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl2, family = binomial)

# Reduced Model
model2 <- glm(whiff ~ pitch_speed + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl2, family = binomial)

summary(model2)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + plate_x + plate_z + release_spin_rate + 
##     speed_change + break_change + pfx_total + distance, family = binomial, 
##     data = whiff_sl2)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.0425752  0.9277708  -5.435 5.47e-08 ***
## pitch_speed        0.0501462  0.0103594   4.841 1.29e-06 ***
## plate_x           -0.3698816  0.0469998  -7.870 3.55e-15 ***
## plate_z           -0.4913540  0.0488865 -10.051  < 2e-16 ***
## release_spin_rate  0.0002894  0.0001245   2.325   0.0201 *  
## speed_change       0.0038569  0.0228735   0.169   0.8661    
## break_change       0.2291186  0.2001940   1.144   0.2524    
## pfx_total         -0.1649991  0.1196077  -1.380   0.1677    
## distance          -0.7165732  0.0642529 -11.152  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11359  on 13558  degrees of freedom
## Residual deviance: 11095  on 13550  degrees of freedom
##   (98 observations deleted due to missingness)
## AIC: 11113
## 
## Number of Fisher Scoring iterations: 5
preds2 <- whiff_sl2 %>% 
  mutate(prediction_log = predict(model2, whiff_sl2),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

preds2 %>% 
  ggplot(aes(x = as.character(whiff), y = prediction)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1, width = 0.1, height = 0)

preds2 %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Whiff proportion by predicted whiff value",
       subtitle = "Whiff predictions have a 1% bin width")

whiff_l %>% 
  mutate(count = paste0(balls, "-", strikes)) %>% 
  filter(pitch_type == "SL") %>% 
  ggplot(aes(y = whiff, x = pfx_z*12)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~count) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome") +
  NULL

# Sliders
whiff_l %>% 
  filter(pitch_type =="SL") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Fastballs
whiff_l %>% 
  filter(pitch_type =="FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Change-Ups
whiff_l %>% 
  filter(pitch_type =="CH") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

whiff_l %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "SL",
         prev_pitch  %in% c("FF", "CH", "SL", "CU"),
         player_name == "Fried, Max") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(prev_pitch), rows = vars(whiff)) +
  theme_bw()

whiff_l %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(player_name), rows = vars(whiff)) +
  theme_bw()

zoned2 <- whiff_l %>% 
  mutate(loc_x = round(plate_x*3, 0),
         loc_y = round(plate_z*3, 0))

zoned2 %>% 
  filter(pitch_type == "FF",
         plate_z > 0 & plate_z < 6,
         plate_x > -1.5 & plate_x < 1.5) %>% 
  summarize(whiff_perc = mean(whiff == "TRUE"),
            pitches = n(),
            .by = c(loc_x, loc_y, player_name)) %>% 
  filter(pitches >= 10) %>% 
  ggplot(aes(x = -loc_x, y = loc_y, fill = whiff_perc)) + 
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "red") +
  facet_wrap(~ player_name) +
  coord_fixed() +
  theme_bw()